{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display name of the official account to look up; it is typed into the account search box later on.\n",
    "公众号 = \"腾讯媒体研究院\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random\n",
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# coding=utf-8\n",
    "from selenium import webdriver\n",
    "import time\n",
    "\n",
    "# Smoke test: run a Baidu search in a separate Chrome session and close it again.\n",
    "wd = webdriver.Chrome()\n",
    "try:\n",
    "    wd.get(\"https://www.baidu.com\")    # open the Baidu home page\n",
    "    wd.find_element_by_id(\"kw\").send_keys(\"selenium\")   # locate the search box and type the keyword\n",
    "    wd.find_element_by_id(\"su\").click()   # click the search button\n",
    "    time.sleep(3)   # wait 3 seconds\n",
    "finally:\n",
    "    wd.quit()   # always close the browser, even if a step above raised"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-5-8ae4025e7ff4>:19: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "# Chrome launch options for the scraping session.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the \"DevToolsActivePort file doesn't exist\" startup error\n",
    "# NOTE(review): Chrome documents this switch as width,height (comma-separated); confirm the \"x\" form is honoured.\n",
    "opts.add_argument('window-size=1920x3000')  # requested browser window size\n",
    "opts.add_argument('--disable-gpu')  # Google's documentation mentions this flag as a workaround for a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false')  # do not load images, to speed things up\n",
    "#opts.add_argument('--headless')  # no visible browser window; on Linux without a display, startup fails without this\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "\n",
    "# `chrome_options=` is deprecated (Selenium emits a DeprecationWarning for it); `options=` is the supported keyword.\n",
    "driver = webdriver.Chrome(options=opts)  #desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the mp.weixin.qq.com landing page; the login steps below run against it.\n",
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.自动化登录（需扫码）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.1自动化账号密码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Credentials are read from the environment so they are never stored in the notebook.\n",
    "# Both fall back to an empty string, in which case nothing is typed into the login fields.\n",
    "payload = {\n",
    "    \"account\": os.environ.get(\"WECHAT_MP_ACCOUNT\", \"\"),\n",
    "    \"password\": os.environ.get(\"WECHAT_MP_PASSWORD\", \"\"),\n",
    "}\n",
    "# Switch from QR-code login to account/password login.\n",
    "driver.find_element_by_xpath('//div[@class=\"login__type__container login__type__container__scan\"]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Clear the account input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the account name from `payload` into the account input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"account\"]').send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Clear the password input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the password from `payload` into the password input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the link inside the login button panel\n",
    "driver.find_element_by_xpath('//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'展开'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 2.3 Find the menu\n",
    "# Click the element with id \"m_open\" and show its inner HTML as the cell result.\n",
    "element = driver.find_element_by_xpath('//a[@id=\"m_open\"]')\n",
    "element.click()\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scroll the window down to the full height of the document body.\n",
    "driver.execute_script(\"window.scrollTo(0,document.body.scrollHeight)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click path: Article materials -> New article template -> Hyperlink -> Choose another official account -> input -> Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://mp.weixin.qq.com/cgi-bin/appmsg?begin=0&count=10&type=10&action=list_card&token=85241463&lang=zh_CN'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look up the side-menu link, keep its inner HTML, and show the URL it points to.\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_mp_sidemenu_pop\"]/div[2]/div/div/ul/li[2]/ul/li[1]/ul/li[1]/a')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "url2 = element.get_attribute(\"href\")\n",
    "url2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(url2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate the element under #js_main, record its inner HTML, then click it.\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div[1]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<selenium.webdriver.remote.webelement.WebElement (session=\"5e928ad3becf4ffc8b932de9e9967472\", element=\"6c0ef886-e120-496a-b5fd-2a5c7c2170d4\")>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div[1]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-5DAB6490702121B8D036350DED4D04BC', 'CDwindow-9F32A528BDA9ACA2E687F15319147CAC']\n"
     ]
    }
   ],
   "source": [
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Point the driver at the last window handle in the list.\n",
    "driver.switch_to.window(driver.window_handles[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "超链接\n"
     ]
    }
   ],
   "source": [
    "# Find the element with id \"js_editor_insertlink\", print its inner HTML, then click it.\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_editor_insertlink\"]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "选择其他公众号\n"
     ]
    }
   ],
   "source": [
    "# Click \"Choose another official account\"\n",
    "element = driver.find_element_by_xpath('//*[@id=\"vue_app\"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/p/div/button') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look the account search box up once, then clear it and type the account name.\n",
    "search_box = driver.find_element_by_xpath('//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]')\n",
    "search_box.clear()\n",
    "search_box.send_keys(公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# Click the magnifier button to run the search\n",
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/cdSLicavGUr23RakfD1j8csT3LMhYpOQ21ECXeZMpic1k1dTBWprafgazS97J5JaVCHNQr4ib0cuSGVDWc9rJUiatQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">腾讯媒体研究院</strong> <i class=\"inner_link_account_wechat\">微信号：TencentMRI</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/gRl4QnJRXWbgpDlacGvtd3icLOwstUj5eIRYLZyPVUBGV1BZAGjve7t7IX5tvLibMic2mwkiam9IibyDv50M02RNjxQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">腾讯芒种特训营</strong> <i class=\"inner_link_account_wechat\">微信号：mz_camp</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/HH8uRF2v4zyt0JfL70RQGMOGbagyprO18gIJaSJnvnicMvxquKlUMbRd7bhWicA36EGlxiaka0eVQjyiaKsAHwiaUdA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">全媒派</strong> <i class=\"inner_link_account_wechat\">微信号：quanmeipai</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/Sicic1OQOtC111vJbaG1UBiakzlTBHLAlW7VK6v51mvbN4ruWh2SwUe2BPceg4PUKznGJ57jANU5UpzBCpEjMa35A/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">腾讯媒体生态研究院</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/9iaB4A0N22VWh097F7WODaBmz6g2ddOfQhQ09k5ML4DP90q50PUe8WicADIQmEEqhYTONC9Ya4MEtONxp3PHKTgQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">中国吉林网</strong> <i class=\"inner_link_account_wechat\">微信号：jl-news</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "# Capture the inner HTML of the account result list and keep it in 公众号SERP.\n",
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the captured result-list HTML with lxml\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One <li class=\"inner_link_account_item\"> node per search hit.\n",
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "# Pull the nickname text, the WeChat id text and the avatar URL out of every hit.\n",
    "account_list = [\n",
    "    {\n",
    "        \"nickname\": e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text,\n",
    "        \"wechat\": e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text,\n",
    "        \"img\": e.xpath('./div/img/@src')[0],\n",
    "    }\n",
    "    for e in 主\n",
    "]\n",
    "\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>腾讯媒体研究院</td>\n",
       "      <td>微信号：TencentMRI</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/cdSLicavGUr23Ra...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>腾讯芒种特训营</td>\n",
       "      <td>微信号：mz_camp</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/gRl4QnJRXWbgpDl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>全媒派</td>\n",
       "      <td>微信号：quanmeipai</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/HH8uRF2v4zyt0Jf...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>腾讯媒体生态研究院</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/Sicic1OQOtC111v...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>中国吉林网</td>\n",
       "      <td>微信号：jl-news</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/9iaB4A0N22VWh09...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    nickname          wechat  \\\n",
       "0    腾讯媒体研究院  微信号：TencentMRI   \n",
       "1    腾讯芒种特训营     微信号：mz_camp   \n",
       "2        全媒派  微信号：quanmeipai   \n",
       "3  腾讯媒体生态研究院         微信号：未设置   \n",
       "4      中国吉林网     微信号：jl-news   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/cdSLicavGUr23Ra...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/gRl4QnJRXWbgpDl...  \n",
       "2  http://mmbiz.qpic.cn/mmbiz_png/HH8uRF2v4zyt0Jf...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/Sicic1OQOtC111v...  \n",
       "4  http://mmbiz.qpic.cn/mmbiz_png/9iaB4A0N22VWh09...  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/cdSLicavGUr23RakfD1j8csT3LMhYpOQ21ECXeZMpic1k1dTBWprafgazS97J5JaVCHNQr4ib0cuSGVDWc9rJUiatQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">腾讯媒体研究院</strong> <i class=\"inner_link_account_wechat\">微信号：TencentMRI</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "# Print and then click the first <li> of the account result list.\n",
    "# NOTE(review): this picks whichever account is listed first - confirm it is the one named in 公众号.\n",
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Jump-to-page test (disabled). Kept as comments rather than a bare string literal,\n",
    "# which the notebook would echo back as the cell's output.\n",
    "# 跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "# 跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "# 跳转_input.clear()\n",
    "# 跳转_input.send_keys(2)\n",
    "# 跳转_a.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 151]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# Collect the pagination number labels and convert their text to integers.\n",
    "l_e = driver.find_elements_by_xpath('//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int  = [int(x.text) for x in l_e] \n",
    "print (l_e_int)\n",
    "# Print whether the first and the last label carry the same number.\n",
    "print (l_e_int[0]==l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]\n"
     ]
    }
   ],
   "source": [
    "# Scrape at most MAX_PAGES pages, and never more than the pagination widget reports.\n",
    "# l_e_int holds the pagination labels; its last entry is taken to be the total page count (TODO confirm).\n",
    "MAX_PAGES = 50\n",
    "last_page = min(l_e_int[-1], MAX_PAGES)\n",
    "pages = list(range(1, last_page + 1))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 爬取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# html_raw collects the scraped markup keyed by page number; process_pages writes into it.\n",
    "# Re-running this cell replaces it with an empty dict.\n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages(pages, base_delay=45, jitter=120):\n",
    "    \"\"\"Visit each page of the article list and store its HTML in ``html_raw``.\n",
    "\n",
    "    For every page number the function types it into the pagination jump box,\n",
    "    clicks the jump link, waits, and then saves the ``innerHTML`` of the\n",
    "    ``inner_link_article_list`` container under ``html_raw[page]``.\n",
    "\n",
    "    Args:\n",
    "        pages: iterable of page numbers to visit, in order.\n",
    "        base_delay: minimum number of seconds to wait after each jump.\n",
    "        jitter: upper bound, in seconds, of the extra random wait added on top.\n",
    "\n",
    "    Returns:\n",
    "        None. Results are written to the notebook-level ``html_raw`` dict.\n",
    "    \"\"\"\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        jump_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        jump_link = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        jump_input.clear()\n",
    "        # Send the page number as text so any integer-like value is typed digit by digit.\n",
    "        jump_input.send_keys(str(p))\n",
    "        jump_link.click()\n",
    "\n",
    "        # Randomised pause before reading the list; presumably this also keeps the\n",
    "        # request rate low - confirm before shortening it.\n",
    "        time.sleep(base_delay + jitter*random())\n",
    "\n",
    "        article_list = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        html_raw[p] = article_list.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "6   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "7   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "8   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "9   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "10  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "11  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "13  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "14  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "15  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "16  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "17  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "18  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "19  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "20  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "21  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "22  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "23  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "24  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "25  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "26  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "27  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "28  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "29  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "30  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "31  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "32  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "33  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "34  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "35  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "36  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "37  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "38  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "39  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "40  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "41  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "42  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "43  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "44  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "45  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "46  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "47  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "48  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "49  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "50  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle \n",
    "filehandler = open(\"html_raw\", 'wb') \n",
    "pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [html_snippets]\n",
       "Index: []"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = df[~df.duplicated()]\n",
    "print (len(df_out))\n",
    "df[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try_again = list(df[df.duplicated()].index)\n",
    "print(try_again)\n",
    "try_again = try_again + list (set(pages).difference(set(df.index.values)))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"原文件\"\n",
    "df_out.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7,7,5,5,5,6,7,5,5,5,6,6,6,5,6,6,6,7,5,7,7,5,5,5,5,5,5,8,7,6,6,9,5,6,6,9,8,5,6,6,5,5,5,5,6,7,6,5,9,7,"
     ]
    }
   ],
   "source": [
    "def parse_html_snippets(_snippet_):\n",
    "    root = fromstring(_snippet_) \n",
    "    title = [x.text for x in root.xpath('//div[@class=\"inner_link_article_title\"]/span[2]')]\n",
    "    create_time = [x.text for x in root.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    link = [x for x in root.xpath('//a/@href')]\n",
    "    text = [get_text(x) for x in link]\n",
    "    _df_ = pd.DataFrame({\"title\":title, \"create_time\":create_time, \"link\":link, \"text\":text})\n",
    "    return(_df_)\n",
    "\n",
    "def get_text(link):\n",
    "    session = HTMLSession()\n",
    "    r = session.get(url=link)\n",
    "    text_xpath_1 = '//*[@id=\"js_content\"]//span/text()'\n",
    "    text_xpath_2 = '//*[@id=\"js_content\"]//p/text()'\n",
    "    text_1 = ''.join(r.html.xpath(text_xpath_1))\n",
    "    text_2 = ''.join(r.html.xpath(text_xpath_2))\n",
    "    return text_1 + text_2\n",
    "\n",
    "\n",
    "l_df = []\n",
    "for p in pages:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>地震、马拉松事故，应急报道注意这六大问题！丨芒种·观点</td>\n",
       "      <td>2021-05-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>灾难、事故的报道，通常要经历两个阶段，第一阶段报道是“拼手速”、求准确，简单告知受众新闻要素...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>腾讯媒体研究院招聘新媒体运营实习生丨招聘</td>\n",
       "      <td>2021-05-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”腾讯媒体研究院正在招聘新媒体运营实习生，以下为我们的简...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>“夜听”合伙人肖涛：不同内容形态下IP升级的方法 丨芒种·案例</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”在4月的腾讯芒种特训营广州公开课上，”合伙人肖涛以《新...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>影视娱乐账号的选题及人格化属性如何打造？“短视频实操密码”专题公开课等你加入丨学员招募</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”回顾内容产业的发展，游戏、搞笑内容一直是内容平台所追捧...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>微短剧的“春天”是个假象吗？丨芒种·趋势</td>\n",
       "      <td>2021-05-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>腾讯研究院研究员、博士后腾讯研究院助理研究员行业之选：微短剧风口背后的两个动力接触微短剧之初...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>在视频内容中「消失」的2.64亿老年人丨芒种·趋势</td>\n",
       "      <td>2021-05-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>针对老年人的视频娱乐内容数量少、品质低，还没有得到应有的重视。在基本完成对年轻人的渗透之后，...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>“宇宙中心”曹县走红，如何借助流量打造“城设”？丨芒种·案例</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>这两天，曹县登上社交平台热搜，“曹县是什么梗”“菏泽曹县666”“山东菏泽曹县”等一时间成为...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>芒果MCN李志华：试错、重置、转向，电视转战新媒体主场的N种方式丨芒种·案例</td>\n",
       "      <td>2021-05-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>理解新发展阶段的媒体生态在执行层面，大部分广电同行对媒介生态的变化仍缺乏统一认知，概念混淆、...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>游戏视频剪辑配音有何技巧？“短视频实操密码”专题公开课等你加入丨学员招募</td>\n",
       "      <td>2021-05-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”回顾内容产业的发展，游戏、搞笑内容一直是内容平台所追捧...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>“DV现场”柯鑫：解读本地热点新闻的生产链条丨芒种·案例</td>\n",
       "      <td>2021-05-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”在4月的腾讯芒种特训营广州公开课上，”内容负责人以《从...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>面对海啸般舆情，真相为何姗姗来迟？丨芒种·观点</td>\n",
       "      <td>2021-05-13</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>今早，新华社发布长文《新华社记者：还原成都49中学生坠亡事件》还原了成都49中学生坠亡事件，...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>UP主谭乔：在主流之外丨芒种·人物</td>\n",
       "      <td>2021-05-12</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>《谭谈交通》停播之后，节目片段被UP主搬上视频网站，爆红网络，豆瓣评分高达9.6分，谭乔转型...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>如何花式剪辑打造搞笑视频？</td>\n",
       "      <td>2021-05-12</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”回顾内容产业的发展，游戏、搞笑内容一直是内容平台所追捧...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>内容行业百年涌现简史丨芒种·观点</td>\n",
       "      <td>2021-05-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>作者丨知乎战略副总裁张宁刚刚过去的二十世纪，世界经历了战乱、崩溃和不断加速的技术革命。回看人...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>游戏、搞笑类账号如何脱颖而出？“短视频实操密码”专题公开课等你加入丨学员招募</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”回顾内容产业的发展，游戏、搞笑内容一直是内容平台所追捧...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>如何成为一名国内顶级内容创作者？丨芒种·观点</td>\n",
       "      <td>2021-05-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>到底什么文章是好的文章？什么书是好书？什么内容是好的内容？4月10日开始，马徐骏「CCOC首...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>用虚拟“取代”现实，这场“梦”该醒了吗？丨芒种·观点</td>\n",
       "      <td>2021-05-06</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>1992年，科幻作家Neal Stephenson在其著作《Snow Crash》中首次提出...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>华人剧荟总裁李捷文：如何打造网络小说IP的爆款剧集？丨芒种·案例</td>\n",
       "      <td>2021-04-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”4月29日，由腾讯大学出品的《临厂发挥》第二季第四集正...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>公号大V“下海” 去B站丨创作者访谈</td>\n",
       "      <td>2021-04-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”“创作者访谈”是腾讯媒体研究院推出的一档创作者访谈栏目...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>“征探财经”创始人周远征：穿透式调查报道背后的隐秘通途丨传媒前线</td>\n",
       "      <td>2021-04-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>点击上方蓝色文字关注“腾讯媒体研究院”“传媒前线”是腾讯媒体研究院推出的一档人物对话栏目。每...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>互联网时代的创作：我们注定不会再有伟大的作品了吗？丨芒种·观点</td>\n",
       "      <td>2021-04-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...</td>\n",
       "      <td>上网超过十年的用户可能都会有相似的观察——内容界面里，“数字”越来越多，也越来越可见了。在W...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          title create_time  \\\n",
       "0                   地震、马拉松事故，应急报道注意这六大问题！丨芒种·观点  2021-05-26   \n",
       "1                          腾讯媒体研究院招聘新媒体运营实习生丨招聘  2021-05-26   \n",
       "2               “夜听”合伙人肖涛：不同内容形态下IP升级的方法 丨芒种·案例  2021-05-25   \n",
       "3   影视娱乐账号的选题及人格化属性如何打造？“短视频实操密码”专题公开课等你加入丨学员招募  2021-05-25   \n",
       "4                          微短剧的“春天”是个假象吗？丨芒种·趋势  2021-05-24   \n",
       "5                     在视频内容中「消失」的2.64亿老年人丨芒种·趋势  2021-05-21   \n",
       "6                “宇宙中心”曹县走红，如何借助流量打造“城设”？丨芒种·案例  2021-05-20   \n",
       "7        芒果MCN李志华：试错、重置、转向，电视转战新媒体主场的N种方式丨芒种·案例  2021-05-19   \n",
       "8          游戏视频剪辑配音有何技巧？“短视频实操密码”专题公开课等你加入丨学员招募  2021-05-19   \n",
       "9                  “DV现场”柯鑫：解读本地热点新闻的生产链条丨芒种·案例  2021-05-14   \n",
       "10                      面对海啸般舆情，真相为何姗姗来迟？丨芒种·观点  2021-05-13   \n",
       "11                            UP主谭乔：在主流之外丨芒种·人物  2021-05-12   \n",
       "12                                如何花式剪辑打造搞笑视频？  2021-05-12   \n",
       "13                             内容行业百年涌现简史丨芒种·观点  2021-05-11   \n",
       "14       游戏、搞笑类账号如何脱颖而出？“短视频实操密码”专题公开课等你加入丨学员招募  2021-05-10   \n",
       "15                       如何成为一名国内顶级内容创作者？丨芒种·观点  2021-05-07   \n",
       "16                   用虚拟“取代”现实，这场“梦”该醒了吗？丨芒种·观点  2021-05-06   \n",
       "17             华人剧荟总裁李捷文：如何打造网络小说IP的爆款剧集？丨芒种·案例  2021-04-29   \n",
       "18                           公号大V“下海” 去B站丨创作者访谈  2021-04-28   \n",
       "19             “征探财经”创始人周远征：穿透式调查报道背后的隐秘通途丨传媒前线  2021-04-27   \n",
       "20              互联网时代的创作：我们注定不会再有伟大的作品了吗？丨芒种·观点  2021-04-26   \n",
       "\n",
       "                                                 link  \\\n",
       "0   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "1   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "2   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "3   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "4   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "5   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "6   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "7   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "8   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "9   http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "10  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "11  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "12  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "13  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "14  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "15  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "16  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "17  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "18  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "19  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "20  http://mp.weixin.qq.com/s?__biz=MzIzNzYwMzM3Ng...   \n",
       "\n",
       "                                                 text  \n",
       "0   灾难、事故的报道，通常要经历两个阶段，第一阶段报道是“拼手速”、求准确，简单告知受众新闻要素...  \n",
       "1   点击上方蓝色文字关注“腾讯媒体研究院”腾讯媒体研究院正在招聘新媒体运营实习生，以下为我们的简...  \n",
       "2   点击上方蓝色文字关注“腾讯媒体研究院”在4月的腾讯芒种特训营广州公开课上，”合伙人肖涛以《新...  \n",
       "3   点击上方蓝色文字关注“腾讯媒体研究院”回顾内容产业的发展，游戏、搞笑内容一直是内容平台所追捧...  \n",
       "4   腾讯研究院研究员、博士后腾讯研究院助理研究员行业之选：微短剧风口背后的两个动力接触微短剧之初...  \n",
       "5   针对老年人的视频娱乐内容数量少、品质低，还没有得到应有的重视。在基本完成对年轻人的渗透之后，...  \n",
       "6   这两天，曹县登上社交平台热搜，“曹县是什么梗”“菏泽曹县666”“山东菏泽曹县”等一时间成为...  \n",
       "7   理解新发展阶段的媒体生态在执行层面，大部分广电同行对媒介生态的变化仍缺乏统一认知，概念混淆、...  \n",
       "8   点击上方蓝色文字关注“腾讯媒体研究院”回顾内容产业的发展，游戏、搞笑内容一直是内容平台所追捧...  \n",
       "9   点击上方蓝色文字关注“腾讯媒体研究院”在4月的腾讯芒种特训营广州公开课上，”内容负责人以《从...  \n",
       "10  今早，新华社发布长文《新华社记者：还原成都49中学生坠亡事件》还原了成都49中学生坠亡事件，...  \n",
       "11  《谭谈交通》停播之后，节目片段被UP主搬上视频网站，爆红网络，豆瓣评分高达9.6分，谭乔转型...  \n",
       "12  点击上方蓝色文字关注“腾讯媒体研究院”回顾内容产业的发展，游戏、搞笑内容一直是内容平台所追捧...  \n",
       "13  作者丨知乎战略副总裁张宁刚刚过去的二十世纪，世界经历了战乱、崩溃和不断加速的技术革命。回看人...  \n",
       "14  点击上方蓝色文字关注“腾讯媒体研究院”回顾内容产业的发展，游戏、搞笑内容一直是内容平台所追捧...  \n",
       "15  到底什么文章是好的文章？什么书是好书？什么内容是好的内容？4月10日开始，马徐骏「CCOC首...  \n",
       "16  1992年，科幻作家Neal Stephenson在其著作《Snow Crash》中首次提出...  \n",
       "17  点击上方蓝色文字关注“腾讯媒体研究院”4月29日，由腾讯大学出品的《临厂发挥》第二季第四集正...  \n",
       "18  点击上方蓝色文字关注“腾讯媒体研究院”“创作者访谈”是腾讯媒体研究院推出的一档创作者访谈栏目...  \n",
       "19  点击上方蓝色文字关注“腾讯媒体研究院”“传媒前线”是腾讯媒体研究院推出的一档人物对话栏目。每...  \n",
       "20  上网超过十年的用户可能都会有相似的观察——内容界面里，“数字”越来越多，也越来越可见了。在W...  "
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out.loc[0:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_url_out.to_excel(\"腾讯媒体研究院.xlsx\", sheet_name=\"前五十页内容\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
